Wilcoxon Test
plot-state: single normal sample from PC (BRCA1 vs
TN)
# Amplification burden per cell group (summed HMM states): BRCA1 vs TN tumours.

# BRCA1 tumour run. The original read the header line as data and then dropped
# row 1; header = TRUE skips it directly, so `state`, `start` and `end` stay
# numeric instead of every column being coerced to character.
b17 <- read.table(
  file = "~/brca-infercnv/brca_output_dir_min_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the sum of that group's amplified states.
b17amp <- b17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name), # text before the first "."
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# TN tumour run, processed exactly like the BRCA1 run above.
t17 <- read.table(
  file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

t17amp <- t17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name),
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Stack both cohorts for plotting.
amp <- bind_rows(b17amp, t17amp)

# Violin per sample with the individual cell groups overlaid.
plot <- ggplot(amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
plot

state: single wilcoxon test
# Wilcoxon rank-sum test on summed amplification states: BRCA1 vs TN tumours.

# Sample IDs that make up each tumour group.
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")

# BRCA1 tumour cell groups, labelled for the comparison.
b1amp <- b17amp %>%
  filter(`Sample Name` %in% b1.names) %>%
  mutate(genotype = "BRCA1 Tumor")

# TN tumour cell groups, labelled for the comparison.
tnamp <- t17amp %>%
  filter(`Sample Name` %in% tn.names) %>%
  mutate(genotype = "TN Tumor")

# Both arms are tested on `Summed States` directly. The original copied the
# summed value over the per-gene `state` column and then read one arm from
# `Summed States` and the other from `state`; the values were the same, but
# the `state` column was clobbered in the process.
w.tum.st <- wilcox.test(
  b1amp$`Summed States`,
  tnamp$`Summed States`,
  alternative = "two.sided"
)
w.tum.st # rendered output reports W = 24707, p-value = 6.729e-12
##
## Wilcoxon rank sum test with continuity correction
##
## data: b1amp$`Summed States` and tnamp$state
## W = 24707, p-value = 6.729e-12
## alternative hypothesis: true location shift is not equal to 0
plot-length: single normal sample from PC (BRCA1 vs
TN)
# Amplification burden per cell group (summed length): BRCA1 vs TN tumours.

# BRCA1 tumour run. The original read the header line as data and then dropped
# row 1; header = TRUE skips it directly, so `state`, `start` and `end` stay
# numeric instead of every column being coerced to character.
# NOTE(review): the summed-states analysis of this same comparison reads the
# BRCA1 table from brca_output_dir_min_1_norm; confirm that
# tn_b1_epi_output_dir_subset_min is the intended run here.
b17 <- read.table(
  file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the summed span of its amplified rows.
b17amp <- b17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>% # text before the first "."
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% # (end - start) / 1e6
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# TN tumour run, processed exactly like the BRCA1 run above.
t17 <- read.table(
  file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

t17amp <- t17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>%
  filter(state > 3) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Stack both cohorts for plotting.
amp <- bind_rows(b17amp, t17amp)

# Violin per sample with the individual cell groups overlaid.
plot <- ggplot(amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Summed Length of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
plot

length: single wilcoxon test
# Wilcoxon rank-sum test on summed amplification length: BRCA1 vs TN tumours.

# Sample IDs that make up each tumour group.
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")

# BRCA1 tumour cell groups: keep the listed samples, derive the group label
# from the sample prefix, and make sure the tested column is numeric.
b1amp <- b17amp %>%
  filter(`Sample Name` %in% b1.names) %>%
  mutate(
    genotype = sub("TN_B1", "BRCA1 Tumor", str_sub(`Sample Name`, start = 1, end = 5)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# TN tumour cell groups, prepared the same way.
tnamp <- t17amp %>%
  filter(`Sample Name` %in% tn.names) %>%
  mutate(
    genotype = sub("TN", "TN Tumor", str_sub(`Sample Name`, start = 1, end = 2)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Two-sided rank-sum comparison of the two tumour groups.
w.tum.ln <- wilcox.test(b1amp$`Summed Length`, tnamp$`Summed Length`, alternative = "two.sided")
w.tum.ln # rendered output reports W = 35624, p-value = 0.01283
##
## Wilcoxon rank sum test with continuity correction
##
## data: b1amp$`Summed Length` and tnamp$`Summed Length`
## W = 35624, p-value = 0.01283
## alternative hypothesis: true location shift is not equal to 0
plot-state: multiple samples from PC (B1 vs
pre-neoplastic)
# Amplification burden per cell group (summed HMM states): BRCA1 preneoplastic
# tissue vs BRCA1 tumours, both taken from one multi-sample run.

# The original read the header line as data and then dropped row 1;
# header = TRUE skips it directly, so `state`, `start` and `end` stay numeric
# instead of every column being coerced to character.
b17 <- read.table(
  file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the sum of that group's amplified states.
b17amp <- b17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name), # text before the first "."
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Violin per sample with the individual cell groups overlaid.
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
bplot

state: multiple wilcoxon test
# Wilcoxon rank-sum test on summed amplification states:
# BRCA1 tumours vs BRCA1 preneoplastic tissue (both drawn from b17amp).

# Sample IDs that make up each group.
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")

# BRCA1 tumour cell groups: keep the listed samples, derive the group label
# from the sample prefix, and make sure the tested column is numeric.
b1amp <- b17amp %>%
  filter(`Sample Name` %in% b1.names) %>%
  mutate(
    genotype = sub("TN_B1", "BRCA1 Tumor", str_sub(`Sample Name`, start = 1, end = 5)),
    `Summed States` = as.numeric(`Summed States`)
  )

# Preneoplastic cell groups: same steps, with an extra guard that the
# two-character prefix really is "B1" before relabelling.
pnamp <- b17amp %>%
  filter(`Sample Name` %in% pn.names) %>%
  mutate(genotype = str_sub(`Sample Name`, start = 1, end = 2)) %>%
  filter(genotype == "B1") %>%
  mutate(
    genotype = sub("B1", "BRCA1 Preneoplastic", genotype),
    `Summed States` = as.numeric(`Summed States`)
  )

# Two-sided rank-sum comparison of tumour vs preneoplastic cell groups.
w.brca.st <- wilcox.test(b1amp$`Summed States`, pnamp$`Summed States`, alternative = "two.sided")
w.brca.st # rendered output reports W = 316796, p-value < 2.2e-16
##
## Wilcoxon rank sum test with continuity correction
##
## data: b1amp$`Summed States` and pnamp$`Summed States`
## W = 316796, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
plot-length: multiple samples from PC (B1 vs
pre-neoplastic)
# Amplification burden per cell group (summed length): BRCA1 preneoplastic
# tissue vs BRCA1 tumours, both taken from one multi-sample run.

# The original read the header line as data and then dropped row 1;
# header = TRUE skips it directly, so `state`, `start` and `end` stay numeric
# instead of every column being coerced to character.
b17 <- read.table(
  file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the summed span of its amplified rows.
b17amp <- b17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>% # text before the first "."
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% # (end - start) / 1e6
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Violin per sample with the individual cell groups overlaid.
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Summed Length of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
bplot

length: multiple wilcoxon test
# Wilcoxon rank-sum test on summed amplification length:
# BRCA1 tumours vs BRCA1 preneoplastic tissue (both drawn from b17amp).

# Sample IDs that make up each group.
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")

# BRCA1 tumour cell groups: keep the listed samples, derive the group label
# from the sample prefix, and make sure the tested column is numeric.
b1amp <- b17amp %>%
  filter(`Sample Name` %in% b1.names) %>%
  mutate(
    genotype = sub("TN_B1", "BRCA1 Tumor", str_sub(`Sample Name`, start = 1, end = 5)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Preneoplastic cell groups: same steps, with an extra guard that the
# two-character prefix really is "B1" before relabelling.
pnamp <- b17amp %>%
  filter(`Sample Name` %in% pn.names) %>%
  mutate(genotype = str_sub(`Sample Name`, start = 1, end = 2)) %>%
  filter(genotype == "B1") %>%
  mutate(
    genotype = sub("B1", "BRCA1 Preneoplastic", genotype),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Two-sided rank-sum comparison of tumour vs preneoplastic cell groups.
w.brca.ln <- wilcox.test(b1amp$`Summed Length`, pnamp$`Summed Length`, alternative = "two.sided")
w.brca.ln # rendered output reports W = 305974, p-value < 2.2e-16
##
## Wilcoxon rank sum test with continuity correction
##
## data: b1amp$`Summed Length` and pnamp$`Summed Length`
## W = 305974, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
plot-state: multiple samples from PC (TN vs normal
premenopausal)
# Amplification burden per cell group (summed HMM states): TN tumours vs
# normal premenopausal tissue, both taken from one multi-sample run.

# The original read the header line as data and then dropped row 1;
# header = TRUE skips it directly, so `state`, `start` and `end` stay numeric
# instead of every column being coerced to character.
t17 <- read.table(
  file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the sum of that group's amplified states.
t17amp <- t17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name), # text before the first "."
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Violin per sample with the individual cell groups overlaid.
tplot <- ggplot(t17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
tplot

state: multiple wilcoxon test
# Wilcoxon rank-sum test on summed amplification states:
# TN tumours vs normal premenopausal tissue (both drawn from t17amp).

# Sample IDs that make up each group.
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tp.names <- c("N_0019", "N_0233", "N_0092", "N_0093", "N_0123", "N_0064", "N_0169")

# TN tumour cell groups: keep the listed samples, derive the group label from
# the sample prefix, and make sure the tested column is numeric.
tnamp <- t17amp %>%
  filter(`Sample Name` %in% tn.names) %>%
  mutate(
    genotype = sub("TN", "TN Tumor", str_sub(`Sample Name`, start = 1, end = 2)),
    `Summed States` = as.numeric(`Summed States`)
  )

# Premenopausal cell groups, prepared the same way.
tpamp <- t17amp %>%
  filter(`Sample Name` %in% tp.names) %>%
  mutate(
    genotype = sub("N", "Human Premenopausal", str_sub(`Sample Name`, start = 1, end = 1)),
    `Summed States` = as.numeric(`Summed States`)
  )

# Two-sided rank-sum comparison of TN tumour vs premenopausal cell groups.
w.tn.st <- wilcox.test(tnamp$`Summed States`, tpamp$`Summed States`, alternative = "two.sided")
w.tn.st # rendered output reports W = 40473, p-value < 2.2e-16
##
## Wilcoxon rank sum test with continuity correction
##
## data: tnamp$`Summed States` and tpamp$`Summed States`
## W = 40473, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
plot-length: multiple samples from PC (TN vs normal
premenopausal)
# Amplification burden per cell group (summed length): TN tumours vs normal
# premenopausal tissue, both taken from one multi-sample run.

# The original read the header line as data and then dropped row 1;
# header = TRUE skips it directly, so `state`, `start` and `end` stay numeric
# instead of every column being coerced to character.
t17 <- read.table(
  file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the summed span of its amplified rows.
t17amp <- t17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>% # text before the first "."
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% # (end - start) / 1e6
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Violin per sample with the individual cell groups overlaid.
tplot <- ggplot(t17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Summed Length of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
tplot

length: multiple wilcoxon test
# Wilcoxon rank-sum test on summed amplification length:
# TN tumours vs normal premenopausal tissue (both drawn from t17amp).

# Sample IDs that make up each group.
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tp.names <- c("N_0019", "N_0233", "N_0092", "N_0093", "N_0123", "N_0064", "N_0169")

# TN tumour cell groups: keep the listed samples, derive the group label from
# the sample prefix, and make sure the tested column is numeric.
tnamp <- t17amp %>%
  filter(`Sample Name` %in% tn.names) %>%
  mutate(
    genotype = sub("TN", "TN Tumor", str_sub(`Sample Name`, start = 1, end = 2)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Premenopausal cell groups, prepared the same way.
tpamp <- t17amp %>%
  filter(`Sample Name` %in% tp.names) %>%
  mutate(
    genotype = sub("N", "Human Premenopausal", str_sub(`Sample Name`, start = 1, end = 1)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Two-sided rank-sum comparison of TN tumour vs premenopausal cell groups.
w.tn.ln <- wilcox.test(tnamp$`Summed Length`, tpamp$`Summed Length`, alternative = "two.sided")
w.tn.ln # rendered output reports W = 36669, p-value = 2.808e-15
##
## Wilcoxon rank sum test with continuity correction
##
## data: tnamp$`Summed Length` and tpamp$`Summed Length`
## W = 36669, p-value = 2.808e-15
## alternative hypothesis: true location shift is not equal to 0
plot-state: single normal sample from PC (Pre-neo vs Human
Premeno)
# Amplification burden per cell group (summed HMM states): BRCA1 preneoplastic
# tissue vs human premenopausal tissue, each from its own run.

# Preneoplastic run. The original read the header line as data and then
# dropped row 1; header = TRUE skips it directly, so `state`, `start` and
# `end` stay numeric instead of every column being coerced to character.
pn17 <- read.table(
  file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the sum of that group's amplified states.
pn17amp <- pn17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name), # text before the first "."
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Premenopausal run, read the same way.
tp17 <- read.table(
  file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Premenopausal samples retained for this comparison.
tp.names <- c("N_0019", "N_0233", "N_0092", "N_0093", "N_0123",
              "N_0064", "N_0169")

# Same summary as for the preneoplastic run, restricted to tp.names.
tp17amp <- tp17 %>%
  mutate(
    `Sample Name` = gsub("\\..*", "", cell_group_name),
    state = as.numeric(state)
  ) %>%
  filter(state > 3) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  filter(`Sample Name` %in% tp.names) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Stack both cohorts for plotting.
norm17amp <- bind_rows(pn17amp, tp17amp)

# Violin per sample with the individual cell groups overlaid.
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
normplot

state: single wilcoxon test
# Wilcoxon rank-sum test on summed amplification states:
# BRCA1 preneoplastic tissue (pn17amp) vs human premenopausal tissue (tp17amp).

# Sample IDs that make up each group.
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
tp.names <- c("N_0019", "N_0233", "N_0092", "N_0093", "N_0123",
              "N_0064", "N_0169")

# Preneoplastic cell groups: keep the listed samples, check that the
# two-character prefix is "B1", relabel, and make the tested column numeric.
pnamp <- pn17amp %>%
  filter(`Sample Name` %in% pn.names) %>%
  mutate(genotype = str_sub(`Sample Name`, start = 1, end = 2)) %>%
  filter(genotype == "B1") %>%
  mutate(
    genotype = sub("B1", "BRCA1 Preneoplastic", genotype),
    `Summed States` = as.numeric(`Summed States`)
  )

# Premenopausal cell groups: keep the listed samples and relabel by prefix.
tpamp <- tp17amp %>%
  filter(`Sample Name` %in% tp.names) %>%
  mutate(
    genotype = sub("N", "Human Premenopausal", str_sub(`Sample Name`, start = 1, end = 1)),
    `Summed States` = as.numeric(`Summed States`)
  )

# Two-sided rank-sum comparison of the two non-tumour groups.
w.norm.st <- wilcox.test(pnamp$`Summed States`, tpamp$`Summed States`, alternative = "two.sided")
w.norm.st # rendered output reports W = 132752, p-value = 0.1886
##
## Wilcoxon rank sum test with continuity correction
##
## data: pnamp$`Summed States` and tpamp$`Summed States`
## W = 132752, p-value = 0.1886
## alternative hypothesis: true location shift is not equal to 0
plot-length: single normal sample from PC (Preneo vs Human
Premeno)
# Amplification burden per cell group (summed length): BRCA1 preneoplastic
# tissue vs human premenopausal tissue, each from its own run.

# Preneoplastic run. The original read the header line as data and then
# dropped row 1; header = TRUE skips it directly, so `state`, `start` and
# `end` stay numeric instead of every column being coerced to character.
pn17 <- read.table(
  file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Amplified calls only (state > 3), collapsed to one row per cell group that
# carries the summed span of its amplified rows.
pn17amp <- pn17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>% # text before the first "."
  filter(state > 3) %>% # numeric comparison (was a string comparison)
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% # (end - start) / 1e6
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup()

# Premenopausal run, read the same way.
tp17 <- read.table(
  file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
  header = TRUE,
  col.names = c("cell_group_name", "gene_region_name", "state", "gene", "chr", "start", "end")
)

# Same summary as for the preneoplastic run, minus sample N_1105_epi.
# NOTE(review): the summed-states version of this plot keeps only the samples
# listed in tp.names, whereas this one only excludes N_1105_epi; confirm the
# two figures are meant to show the same premenopausal samples.
tp17amp <- tp17 %>%
  mutate(`Sample Name` = gsub("\\..*", "", cell_group_name)) %>%
  filter(state > 3) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  ungroup() %>%
  filter(`Sample Name` != "N_1105_epi")

# Stack both cohorts for plotting.
norm17amp <- bind_rows(pn17amp, tp17amp)

# Violin per sample with the individual cell groups overlaid.
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) +
  # The jitter width has to be set on the position object; passed straight to
  # geom_point() it is not a recognised parameter. `size` was also supplied
  # twice (0.1 and 2).
  # NOTE(review): 0.1 (the first value given) is kept; use 2 if larger points
  # were intended.
  geom_point(
    position = position_jitter(width = 0.2),
    size = 0.1, alpha = 0.7, shape = 21, color = "black"
  ) +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Summed Length of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  # Tweaks must follow theme_minimal(): a complete theme replaces any theme()
  # settings added before it.
  theme(
    legend.position = "none",
    panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
    # linewidth replaces the deprecated `size` argument of element_rect()
    strip.background = element_rect(fill = "grey90", color = "black", linewidth = 1.5),
    panel.spacing = unit(1, "lines"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the figure.
normplot

length: single wilcoxon test
# Wilcoxon rank-sum test on summed amplification length:
# BRCA1 preneoplastic tissue (pn17amp) vs human premenopausal tissue (tp17amp).

# Sample IDs that make up each group.
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
tp.names <- c("N_0019", "N_0233", "N_0092", "N_0093", "N_0123", "N_0064", "N_0169")

# Preneoplastic cell groups: keep the listed samples, check that the
# two-character prefix is "B1", relabel, and make the tested column numeric.
pnamp <- pn17amp %>%
  filter(`Sample Name` %in% pn.names) %>%
  mutate(genotype = str_sub(`Sample Name`, start = 1, end = 2)) %>%
  filter(genotype == "B1") %>%
  mutate(
    genotype = sub("B1", "BRCA1 Preneoplastic", genotype),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Premenopausal cell groups: keep the listed samples and relabel by prefix.
tpamp <- tp17amp %>%
  filter(`Sample Name` %in% tp.names) %>%
  mutate(
    genotype = sub("N", "Human Premenopausal", str_sub(`Sample Name`, start = 1, end = 1)),
    `Summed Length` = as.numeric(`Summed Length`)
  )

# Two-sided rank-sum comparison of the two non-tumour groups.
w.norm.ln <- wilcox.test(pnamp$`Summed Length`, tpamp$`Summed Length`, alternative = "two.sided")
w.norm.ln # rendered output reports W = 112481, p-value = 7.063e-08
##
## Wilcoxon rank sum test with continuity correction
##
## data: pnamp$`Summed Length` and tpamp$`Summed Length`
## W = 112481, p-value = 7.063e-08
## alternative hypothesis: true location shift is not equal to 0